The imputation of missing values in clinical variables will be done using the MICE (Multiple Imputation by Chained Equations) method.
The R package used to implement it is called ‘mice’.
https://www.rdocumentation.org/packages/mice/versions/3.16.0/topics/mice
# Deactivate scientific notation: a large `scipen` penalty biases R towards
# printing numbers in fixed rather than exponential notation
options(scipen = 999)
# Load libraries
pacman::p_load(
an9elproject,
tidyverse,
lubridate,
magrittr,
naniar,
mice,
mctest,
plotly,
install = FALSE, update = FALSE
)
# Fetch the cohort project object, pinned to a fixed version
oncoth1 <- get_project("oncothr1", version = "0.0.8003")

# Extract its data slot
oncoth1_data <- oncoth1$data

# Count the variables that contain at least one missing value
oncoth1_data %>%
  n_var_miss()
## [1] 342
Let’s see which of the clinical variables have missing values and which should be imputed to gain statistical power.
# Visualize the percentage of missing values per variable, restricted to the
# clinical variables that actually contain missing values
oncoth1_data %>%
  # All exclusions are expressed as one negated selection
  select(
    !c(
      # Variables excluded by name
      n_appointment_patient_became_case,
      patient_left_study,
      anticoag_tx_lmwh_dosage,
      anticoag_tx_vte_drugs,
      vte_before_entering_study,
      # Free text variables
      contains("reason"),
      # Date variables
      contains("date"),
      # Variables relative to VTE
      starts_with("type_"), # VTE type
      starts_with("cancer_concomitant_"), # VTE concomitant to cancer diagnosis
      starts_with("eval_"), # VTE diagnosed during study
      contains("_study"),
      contains("vte_dx_during_cancer_follow_up"),
      starts_with("rec_"), # VTE recurrences
      ends_with("_type_recurrence"),
      # Variables with time until event
      starts_with("tu_"),
      # Genetic variables
      starts_with("rs"),
      # Variables related to the ONCOTHROMB score
      starts_with("ONCOTHROMB"),
      "GRS"
    )
  ) %>%
  # Keep only variables with at least one missing value
  # (select(where()) replaces the superseded select_if())
  select(where(~ any(is.na(.x)))) %>%
  # Plot % of missing values
  gg_miss_var(show_pct = TRUE)
# Per-variable missingness summary, including the percentage of missing values
naniar_summary <- miss_var_summary(oncoth1_data)

# Keep only the variables whose percentage of missing values is not zero
naniar_summary_missing_values <- filter(naniar_summary, pct_miss != 0)
# Heatmap of missing-data patterns for the clinical variables of interest.
# Useful for investigating any structure of missing observations in the data.
oncoth1_data %>%
  select(
    # Each variable is listed exactly once
    albumin,
    aptt,
    aptt_ratio,
    bilirubin,
    creatinine,
    alkaline_phosphatase,
    tobacco_use,
    copd,
    venous_insufficiency,
    catheter_device,
    oral_contraceptive_tx
  ) %>%
  md.pattern(plot = TRUE, rotate.names = TRUE)
## creatinine copd venous_insufficiency catheter_device oral_contraceptive_tx
## 185 1 1 1 1 1
## 34 1 1 1 1 1
## 9 1 1 1 1 1
## 36 1 1 1 1 1
## 9 1 1 1 1 1
## 15 1 1 1 1 1
## 5 1 1 1 1 1
## 49 1 1 1 1 1
## 2 1 1 1 1 1
## 5 1 1 1 1 1
## 4 1 1 1 1 1
## 3 1 1 1 1 1
## 13 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 4 1 1 1 1 1
## 1 1 1 1 1 1
## 5 1 1 1 1 1
## 2 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 1
## 1 1 1 1 1 0
## 1 1 1 1 0 1
## 1 1 1 0 1 1
## 1 1 0 1 1 1
## 1 0 1 1 1 1
## 1 1 1 1 1
## tobacco_use bilirubin alkaline_phosphatase albumin aptt aptt_ratio
## 185 1 1 1 1 1 1 0
## 34 1 1 1 1 1 0 1
## 9 1 1 1 1 0 1 1
## 36 1 1 1 1 0 0 2
## 9 1 1 1 0 1 1 1
## 15 1 1 1 0 1 0 2
## 5 1 1 1 0 0 1 2
## 49 1 1 1 0 0 0 3
## 2 1 1 0 1 1 0 2
## 5 1 1 0 1 0 0 3
## 4 1 1 0 0 1 0 3
## 3 1 1 0 0 0 1 3
## 13 1 1 0 0 0 0 4
## 1 1 0 1 0 0 0 4
## 1 1 0 0 1 0 0 4
## 4 1 0 0 0 1 0 4
## 1 1 0 0 0 0 1 4
## 5 1 0 0 0 0 0 5
## 2 0 1 1 1 1 1 1
## 1 0 1 1 1 0 0 3
## 1 0 0 0 0 0 0 6
## 1 1 1 1 0 1 0 3
## 1 1 1 0 1 0 0 4
## 1 1 1 1 0 0 0 4
## 1 1 1 1 1 0 0 3
## 1 1 1 1 1 0 0 3
## 4 13 40 112 134 176 484
For all of these variables except aptt and aptt_ratio, the missing values
are missing not at random (MNAR).
Virtually every dataset contains some parts that would be better removed before imputation. This includes, but is not limited to, uninteresting variables with a high proportion of missing data, variables without a code for the missing data, administrative variables, constant variables, duplicated, recoded or standardized variables, and aggregates and indices of other information.
# Subset of variables passed to the imputation (column order matters: it is
# the order in which the variables appear in the imputation objects)
data_for_imputation <- oncoth1_data %>%
  select(
    # Identifier, group and status
    id, patient_group, patient_status_at_end_study,
    # Age, gender and gender-specific variables
    age_when_cancer_dx, gender, menopausal_status, pregnancy,
    oral_contraceptive_tx,
    # Anthropometry
    weight, height,
    body_surface_area, # collinear with bmi and somewhat with creatinine
    bmi_value, bmi_category,
    performance_status_category_corrected,
    # Laboratory values and derived flags
    albumin, aptt, aptt_ratio, bilirubin, creatinine, alkaline_phosphatase,
    hemoglobin, inr, leukocytes, platelets,
    leukocytosis, thrombocytosis, low_hemoglobin,
    # Comorbidities and habits
    tobacco_use, copd, venous_insufficiency,
    # Tumor characteristics
    primary_tumor_simplified, progression_according_to_clinical_stage,
    tnm_stage, t_stage, n_stage,
    histology_type, mucinous_histology, grade_histological_differentiation,
    metastasis_dx, n_metastases,
    # Remaining variables
    catheter_device, khorana_risk_score, tic_onco
  )
# Reduce patient status to its first word and store it as a factor:
# we only want to know if they are alive, dead or unknown
data_for_imputation <- data_for_imputation %>%
  mutate(
    patient_status_at_end_study = as.factor(
      word(patient_status_at_end_study, 1)
    )
  )
# Look for collinear data among the numeric variables.
# Correlation matrix of the numeric columns, using pairwise complete
# observations.
numeric_collinearity <- data_for_imputation %>%
  select(where(is.numeric)) %>%
  cor(use = "pairwise.complete.obs")

# Heatmap and dendrogram of the correlation matrix
heatmap(numeric_collinearity)

# Diagonal correlation plot of the same matrix (lower triangle, labelled)
ggcorrplot::ggcorrplot(
  numeric_collinearity,
  type = "lower",
  show.diag = FALSE,
  lab = TRUE,
  lab_size = 2,
  tl.cex = 10
)
# Interactive correlation heatmap between numeric and categorical variables
data_for_imputation %>%
  # Remove ID and redundant variables
  select(
    -c(
      id, patient_group, patient_status_at_end_study, bmi_category,
      oral_contraceptive_tx, pregnancy, low_hemoglobin, leukocytosis,
      thrombocytosis, t_stage, n_stage, metastasis_dx
    )
  ) %>%
  # Convert these variables with as.numeric()
  # NOTE(review): if they are factors, as.numeric() yields the level codes,
  # not the labels -- confirm this is the intended encoding
  mutate(
    across(
      c(
        performance_status_category_corrected,
        khorana_risk_score,
        n_metastases
      ),
      as.numeric
    )
  ) %>%
  # Encode histological grade as 1-3; any other value becomes NA
  mutate(
    grade_histological_differentiation = case_when(
      grade_histological_differentiation == "Well differentiated" ~ 1,
      grade_histological_differentiation == "Moderately differentiated" ~ 2,
      grade_histological_differentiation == "Poorly differentiated" ~ 3
    )
  ) %>%
  # One-hot encoding (no intercept)
  model.matrix(~0+., data = .) %>%
  cor(use = "pairwise.complete.obs") %>%
  # Convert matrix to data frame
  as.data.frame() %>%
  # Blank out the entries below the diagonal so that each pair of variables
  # is shown only once; the diagonal and the entries above it are kept
  (function(m) {
    m[lower.tri(m, diag = FALSE)] <- NA
    m
  }) %>%
  # Create an interactive heatmap with Plotly
  plot_ly(
    x = colnames(.),
    y = rownames(.),
    z = as.matrix(.),
    type = "heatmap",
    colors = colorRamp(c("blue", "white", "red")), # Adjust color scale as needed
    colorbar = list(title = "Correlation")
  )
The influx of a variable quantifies how well its missing data connects to the observed data on other variables. The outflux of a variable quantifies how well its observed data connect to the missing data on other variables. Variables with higher outflux are (potentially) the more powerful predictors. Variables with higher influx depend strongly on the imputation model.
# Influx/outflux statistics for the variables selected for imputation
dataset_flux <- flux(data_for_imputation)

# Influx-outflux plot of the same dataset
fluxplot(
  data_for_imputation,
  eqscplot = TRUE,
  cex = 0.7,
  ylim = c(0, 1.05)
)
The group in the upper-left corner has (almost) complete information, so the number of missing data problems for this group is relatively small. The intermediate group has an outflux between 0.5 and 0.8, which is small. Missing data problems are more severe, but potentially this group could contain important variables. The third group has an outflux of 0.5 or lower, so its predictive power is limited.
Variables that might cause problems later on in the imputations are located in the lower-right corner.
Most points are relatively close to the diagonal, which indicates that influx and outflux are balanced.
Useful links:
https://cran.r-project.org/web/packages/finalfit/vignettes/missing.html
https://datascienceplus.com/handling-missing-data-with-mice-package-a-simple-approach/
# Dry run of mice (maxit = 0): builds a mids object holding the default
# settings without performing any imputation iterations
init <- mice(data_for_imputation, maxit = 0, seed = 2828)
## Warning: Number of logged events: 3

# Default imputation method chosen for each variable.
# oral_contraceptive_tx is flagged during this step and ends up without an
# imputation method.
meth <- init[["method"]]

# Default predictor matrix
predM <- init[["predictorMatrix"]]
As a general rule, using every bit of available information yields multiple imputations that have minimal bias and maximal efficiency.
It is often beneficial to choose as large a number of predictors as possible. Including as many predictors as possible tends to make the missing at random (MAR) assumption more plausible, thus reducing the need to make special adjustments for MNAR mechanisms.
For imputation purposes, it is expedient to select a suitable subset of data that contains no more than 15 to 25 variables.
# Show the imputation method that will be used for each variable in the dataset:
# - "" (empty): no method assigned, e.g. variables with no missing data
# - "logreg": logistic regression, used for yes/no variables
# - "polyreg": polytomous logistic regression, used for categorical variables
#   with more than two levels
# - "pmm": predictive mean matching, used for numerical variables
print(meth)
## id patient_group
## "" ""
## patient_status_at_end_study age_when_cancer_dx
## "" ""
## gender menopausal_status
## "" "polyreg"
## pregnancy oral_contraceptive_tx
## "logreg" ""
## weight height
## "" ""
## body_surface_area bmi_value
## "" ""
## bmi_category performance_status_category_corrected
## "" "polyreg"
## albumin aptt
## "pmm" "pmm"
## aptt_ratio bilirubin
## "pmm" "pmm"
## creatinine alkaline_phosphatase
## "pmm" "pmm"
## hemoglobin inr
## "" "pmm"
## leukocytes platelets
## "pmm" ""
## leukocytosis thrombocytosis
## "logreg" ""
## low_hemoglobin tobacco_use
## "" "polyreg"
## copd venous_insufficiency
## "logreg" "logreg"
## primary_tumor_simplified progression_according_to_clinical_stage
## "" "polyreg"
## tnm_stage t_stage
## "" "polyreg"
## n_stage histology_type
## "polyreg" "polyreg"
## mucinous_histology grade_histological_differentiation
## "logreg" "polyreg"
## metastasis_dx n_metastases
## "" ""
## catheter_device khorana_risk_score
## "logreg" "polyreg"
## tic_onco
## "logreg"
# Build the predictor matrix with quickpred(), which selects the predictors
# of each variable to be imputed according to simple statistics.
# The result is a square 0/1 matrix: a 1 means that the column variable is
# used as a predictor for the row variable.
predM <- quickpred(
  data_for_imputation,
  include = c("patient_group", "patient_status_at_end_study"),
  mincor = 0.2,
  minpuc = 0.5
)

# Tweak the predictor matrix: zero the columns of uninformative variables so
# that they are not used as predictors for imputation.
# "gender" is deliberately NOT in this list -- do not remove it as a
# predictor if you want post-imputation to work correctly.
predM[, c(
  "id",
  "albumin", "inr", "aptt", "aptt_ratio",
  "oral_contraceptive_tx",
  "grade_histological_differentiation",
  "t_stage", "metastasis_dx", "n_metastases"
)] <- 0

# Distribution of the number of predictors per variable
table(rowSums(predM))
##
## 0 2 3 4 5 6 7 8 9 10 11
## 18 4 3 5 2 1 3 3 1 2 1
# Names of the predictors of a given variable (here: copd), read from its row
# of the predictor matrix
names(data_for_imputation)[which(predM["copd", ] == 1)]
## [1] "patient_group" "patient_status_at_end_study"
## [3] "tobacco_use"
This means that the predictors for imputing missing values in
copd will be patient_group,
patient_status_at_end_study and
tobacco_use.
# Visit sequence: the order in which the MICE algorithm goes through the
# variables, taken from the dry-run object
visit <- init[["visitSequence"]]
print(visit)
## [1] "id"
## [2] "patient_group"
## [3] "patient_status_at_end_study"
## [4] "age_when_cancer_dx"
## [5] "gender"
## [6] "menopausal_status"
## [7] "pregnancy"
## [8] "oral_contraceptive_tx"
## [9] "weight"
## [10] "height"
## [11] "body_surface_area"
## [12] "bmi_value"
## [13] "bmi_category"
## [14] "performance_status_category_corrected"
## [15] "albumin"
## [16] "aptt"
## [17] "aptt_ratio"
## [18] "bilirubin"
## [19] "creatinine"
## [20] "alkaline_phosphatase"
## [21] "hemoglobin"
## [22] "inr"
## [23] "leukocytes"
## [24] "platelets"
## [25] "leukocytosis"
## [26] "thrombocytosis"
## [27] "low_hemoglobin"
## [28] "tobacco_use"
## [29] "copd"
## [30] "venous_insufficiency"
## [31] "primary_tumor_simplified"
## [32] "progression_according_to_clinical_stage"
## [33] "tnm_stage"
## [34] "t_stage"
## [35] "n_stage"
## [36] "histology_type"
## [37] "mucinous_histology"
## [38] "grade_histological_differentiation"
## [39] "metastasis_dx"
## [40] "n_metastases"
## [41] "catheter_device"
## [42] "khorana_risk_score"
## [43] "tic_onco"
# Post-imputation commands: one string of R code per variable
post <- init[["post"]]

# Gender-specific variables share one rule: imputed values of patients whose
# gender is "Male" are overwritten with "Male"
post[c("menopausal_status", "pregnancy", "oral_contraceptive_tx")] <-
  "imp[[j]][data$gender[!r[, j]] == 'Male', i] <- 'Male'"

# Leukocytes' levels
# Imputed leukocytes of patients with leukocytosis are set to 11000
post["leukocytes"] <-
  "imp[[j]][data$leukocytosis[!r[, j]] == 'Yes', i] <- 11000"
# Imputed leukocytosis of patients with leukocytes above 11000 is set to 'Yes'
post["leukocytosis"] <-
  "imp[[j]][data$leukocytes[!r[, j]] > 11000, i] <- 'Yes'"
# Run the imputation with the methods, predictors, visit sequence and
# post-processing rules prepared above
imputed <- mice(
  data_for_imputation,
  m = 10, # number of imputed datasets to generate
  method = meth,
  predictorMatrix = predM,
  visitSequence = visit,
  post = post, # apply post-processing changes
  seed = 2828
)
##
## iter imp variable
## 1 1 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 2 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 3 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 4 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 5 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 6 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 7 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 8 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 9 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 1 10 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 1 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 2 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 3 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 4 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 5 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 6 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 7 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 8 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 9 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 2 10 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 1 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 2 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 3 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 4 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 5 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 6 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 7 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 8 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 9 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 3 10 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 1 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 2 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 3 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 4 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 5 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 6 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 7 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 8 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 9 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 4 10 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 1 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 2 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 3 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 4 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 5 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 6 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 7 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 8 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 9 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## 5 10 menopausal_status pregnancy performance_status_category_corrected albumin aptt aptt_ratio bilirubin creatinine alkaline_phosphatase inr leukocytes leukocytosis tobacco_use copd venous_insufficiency progression_according_to_clinical_stage t_stage n_stage histology_type mucinous_histology grade_histological_differentiation catheter_device khorana_risk_score tic_onco
## Warning: Number of logged events: 151
# Trace plots for inspecting the imputation process through the iterations.
# oral_contraceptive_tx is left out: it is not found because no imputation
# is done on this variable.
plot(
  imputed,
  y = c(
    "menopausal_status", "performance_status_category_corrected",
    "albumin", "aptt", "aptt_ratio", "bilirubin", "creatinine",
    "alkaline_phosphatase", "inr", "leukocytes", "leukocytosis",
    "pregnancy",
    "tobacco_use", "venous_insufficiency",
    "progression_according_to_clinical_stage",
    "t_stage", "n_stage",
    "histology_type", "mucinous_histology",
    "grade_histological_differentiation",
    "catheter_device", "khorana_risk_score", "tic_onco"
  )
)
# Return completed data after imputation.
# With the default `action`, complete() returns only the first of the imputed
# datasets. The call is namespace-qualified because tidyr (attached through
# tidyverse) also exports a complete() function, so the unqualified name
# depends on the order in which the packages were attached.
whole_imputed_data <- mice::complete(imputed)

# Plot missing data patterns in the completed dataset as a sanity check:
# ideally no missing values remain
md.pattern(whole_imputed_data, rotate.names = TRUE)
## id patient_group patient_status_at_end_study age_when_cancer_dx gender
## 389 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## menopausal_status pregnancy weight height body_surface_area bmi_value
## 389 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 0 0 0 0 0 0
## bmi_category performance_status_category_corrected albumin aptt aptt_ratio
## 389 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## bilirubin creatinine alkaline_phosphatase hemoglobin inr leukocytes
## 389 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 0 0 0 0 0 0
## platelets leukocytosis thrombocytosis low_hemoglobin tobacco_use copd
## 389 1 1 1 1 1 1
## 1 1 1 1 1 1 1
## 0 0 0 0 0 0
## venous_insufficiency primary_tumor_simplified
## 389 1 1
## 1 1 1
## 0 0
## progression_according_to_clinical_stage tnm_stage t_stage n_stage
## 389 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## histology_type mucinous_histology grade_histological_differentiation
## 389 1 1 1
## 1 1 1 1
## 0 0 0
## metastasis_dx n_metastases catheter_device khorana_risk_score tic_onco
## 389 1 1 1 1 1
## 1 1 1 1 1 1
## 0 0 0 0 0
## oral_contraceptive_tx
## 389 1 0
## 1 0 1
## 1 1
For some reason, there is one row where
oral_contraceptive_tx still has a missing value. This
probably has something to do with the fact that this variable is constant
(no patient had oral contraceptives during the study) and no imputation
method was assigned.
# Plot the density of the numerical variable distributions, for both the
# observed and the imputed data, to compare them visually
densityplot(imputed)
The curves resulting from the iterations of the imputation process are overall quite similar to those of the observed data (although there is some variability). This is a good sign.
# Long-format data frame with all the imputed datasets stacked on top of each
# other. The call is namespace-qualified because tidyr (attached through
# tidyverse) also exports a complete() function.
imputation_data_long <- mice::complete(imputed, action = "long")
# Keep the ID plus the variables listed below, each renamed with an "_imp"
# suffix to mark it as coming from the completed (imputed) dataset.
# NOTE(review): metastasis_dx is listed although no imputation method was
# assigned to it, whereas pregnancy (imputed with logreg) is not -- confirm
# that this selection is intended.
whole_imputed_data <- whole_imputed_data %>%
  select(
    id,
    menopausal_status,
    performance_status_category_corrected,
    albumin, aptt, aptt_ratio, bilirubin, creatinine, alkaline_phosphatase,
    inr, leukocytes, leukocytosis,
    tobacco_use, copd, venous_insufficiency,
    progression_according_to_clinical_stage,
    t_stage, n_stage,
    histology_type, mucinous_histology, grade_histological_differentiation,
    metastasis_dx,
    catheter_device, khorana_risk_score, tic_onco
  ) %>%
  rename_with(~ paste0(.x, "_imp"), .cols = !id)
# Save results in RData format
# save(whole_imputed_data, file = "oncoth1_whole_imputed_data.RData")